The Retail Data Analytics dataset is available on Kaggle and provides historical sales data for 45 stores located in different regions; each store contains a number of departments.
Anonymized information about the 45 stores, indicating the type and size of store
Contains additional data related to the store, department, and regional activity for the given dates.
Historical sales data, which covers 2010-02-05 to 2012-11-01. Within this tab you will find the following fields:
# Load the department-level weekly sales table; readr guesses the column types.
sales <- read_csv(file = "retail-data-analytics/sales data-set.csv")
## Parsed with column specification:
## cols(
## Store = col_double(),
## Dept = col_double(),
## Date = col_character(),
## Weekly_Sales = col_double(),
## IsHoliday = col_logical()
## )
# Load the per-store, per-date features table; readr guesses the column types.
features <- read_csv(file = "retail-data-analytics/Features data set.csv")
## Parsed with column specification:
## cols(
## Store = col_double(),
## Date = col_character(),
## Temperature = col_double(),
## Fuel_Price = col_double(),
## MarkDown1 = col_double(),
## MarkDown2 = col_double(),
## MarkDown3 = col_double(),
## MarkDown4 = col_double(),
## MarkDown5 = col_double(),
## CPI = col_double(),
## Unemployment = col_double(),
## IsHoliday = col_logical()
## )
# Load the store metadata table (Store, Type, Size).
stores <- read_csv(file = "retail-data-analytics/stores data-set.csv")
## Parsed with column specification:
## cols(
## Store = col_double(),
## Type = col_character(),
## Size = col_double()
## )
# Build one wide table: sales joined to store metadata on Store, then joined
# to the features table on the keys both tables share.
sales_complete <- merge(
  x = merge(x = sales, y = stores, by = "Store"),
  y = features,
  by = c("Store", "Date", "IsHoliday")
)
# Preview the first six rows of the combined table.
head(sales_complete, n = 6L)
## Store Date IsHoliday Dept Weekly_Sales Type Size Temperature
## 1 1 01/04/2011 FALSE 49 13167.85 A 151315 59.17
## 2 1 01/04/2011 FALSE 26 5946.53 A 151315 59.17
## 3 1 01/04/2011 FALSE 81 28545.23 A 151315 59.17
## 4 1 01/04/2011 FALSE 34 9949.54 A 151315 59.17
## 5 1 01/04/2011 FALSE 59 316.86 A 151315 59.17
## 6 1 01/04/2011 FALSE 30 3897.48 A 151315 59.17
## Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 1 3.524 NA NA NA NA NA 214.8372
## 2 3.524 NA NA NA NA NA 214.8372
## 3 3.524 NA NA NA NA NA 214.8372
## 4 3.524 NA NA NA NA NA 214.8372
## 5 3.524 NA NA NA NA NA 214.8372
## 6 3.524 NA NA NA NA NA 214.8372
## Unemployment
## 1 7.682
## 2 7.682
## 3 7.682
## 4 7.682
## 5 7.682
## 6 7.682
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 421570 obs. of 5 variables:
## $ Store : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-12" ...
## $ Weekly_Sales: num 24925 46039 41596 19404 21828 ...
## $ IsHoliday : num 0 1 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Store = col_double(),
## .. Dept = col_double(),
## .. Date = col_character(),
## .. Weekly_Sales = col_double(),
## .. IsHoliday = col_logical()
## .. )
# Restrict both tables to stores 1 through 5.
sales_complete_subset <- sales_complete[
  which(sales_complete$Store >= 1 & sales_complete$Store <= 5),
]
sales_subset <- sales[which(sales$Store >= 1 & sales$Store <= 5), ]

# Single store/department series used by the time-series plots.
sales_store1_dept1 <- subset(sales_subset, Store == 1 & Dept == 1)
sales_store1_dept2 <- subset(sales_subset, Store == 1 & Dept == 2)

# Department 1 across all five stores in the subset.
sales_stores1to5_dept1 <- subset(sales_subset, Dept == 1)
# Box plot of log weekly sales, one box per store Type.
# log() is NaN for negative Weekly_Sales (and -Inf for zero), so keep only the
# strictly positive rows up front instead of letting log() warn and plotly
# silently drop them (the unfiltered call ignored 1285 observations).
p <- plot_ly(
  subset(sales_complete, Weekly_Sales > 0),
  x = ~log(Weekly_Sales),
  color = ~Type,
  type = "box"
)
p
# Box plot of weekly sales for the stores in sales_subset, one box per store.
# Store is numeric, so mapping it to color directly is treated as a continuous
# scale, which box traces cannot use (plotly warned "line.color doesn't (yet)
# support data arrays" and "Only one fillcolor per trace allowed").
# Converting it to a factor gives one discretely coloured box trace per store.
p <- plot_ly(
  sales_subset,
  x = ~Weekly_Sales,
  color = ~factor(Store),
  type = "box"
)
p
# Scatter plot of weekly sales over time for store 1, department 1.
# type and mode are set explicitly to the values plotly would otherwise infer
# ("scatter" with "markers"), so the figure no longer relies on trace guessing
# and the "No trace type specified" / "No scatter mode specifed" messages
# are not emitted.
plot_ly(
  sales_store1_dept1,
  x = ~Date,
  y = ~Weekly_Sales,
  type = "scatter",
  mode = "markers"
)
# Line chart of weekly sales over time for store 1, department 1, with a title.
plot_ly(
  data = sales_store1_dept1,
  type = "scatter",
  mode = "lines",
  x = ~Date,
  y = ~Weekly_Sales
) %>%
  layout(title = "Weekly Sales: Store 1 and Dept 1")
# One line trace per department of store 1, each in its own figure.
p1 <- add_lines(
  plot_ly(sales_store1_dept1, x = ~Date, y = ~Weekly_Sales),
  name = "Store 1 Dept 1"
)
p2 <- add_lines(
  plot_ly(sales_store1_dept2, x = ~Date, y = ~Weekly_Sales),
  name = "Store 1 Dept 2"
)
# Combine the two figures into a single subplot layout.
subplot(p1, p2)
Before we use the subplot function, we need to change our data format from long to wide using the function spread.
# Reshape department 1 sales from long to wide: one Weekly_Sales column per
# store. Despite its name, this table holds stores 1-5 of department 1.
# NOTE(review): spread() is superseded by pivot_wider() in newer tidyr.
sales_store1_dept1and2_wide <- spread(
  data = sales_stores1to5_dept1,
  key = Store,
  value = Weekly_Sales
)

# Keep only the date and the five per-store columns, selected by name.
sales_store1_dept1and2_wide <-
  sales_store1_dept1and2_wide[c("Date", as.character(1:5))]

# Give the store columns readable names.
names(sales_store1_dept1and2_wide) <-
  c("Date", "Store1", "Store2", "Store3", "Store4", "Store5")

# Every column except Date becomes one line chart.
vars <- setdiff(names(sales_store1_dept1and2_wide), "Date")
plots <- lapply(vars, function(store_col) {
  # Build the y formula from the column name, e.g. ~Store1.
  y_formula <- as.formula(paste0("~", store_col))
  add_lines(
    plot_ly(sales_store1_dept1and2_wide, x = ~Date, y = y_formula),
    name = store_col
  )
})

# Stack the charts vertically, one row per store, sharing the x axis.
subplot(plots, nrows = length(plots), shareX = TRUE, titleX = FALSE)
Now it is your turn to create a beautiful HTML report from the provided dataset in R Markdown. Add to your report at least one
This presentation was based on